library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the tab-separated vaccine decomposition table from the working directory
df <- readr::read_tsv(file = "decomposition_proc.tsv")
## Rows: 394 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (14): vacc_name, vacc_vocab, p1, p2, p3, p4, p5, p6, m1, m2, m3, m4, m5, m6
## dbl (1): vacc_id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reshape the wide p1..p6 / m1..m6 columns into one row per vaccine component
# (vacc_id + num), with that component's disease (p*) and mechanism (m*) side
# by side.
df2 <- df %>%
  select(vacc_id, matches("^p|^m")) %>%
  # split each column name into its prefix ("p" / "m") and its last character
  pivot_longer(
    cols = -vacc_id,
    names_to = c("key", "num"),
    names_sep = -1,
    values_to = "value"
  ) %>%
  # names_sort = TRUE yields the m-then-p column order that spread() produced
  pivot_wider(names_from = "key", values_from = "value", names_sort = TRUE) %>%
  rename(disease = p, mechanism = m) %>%
  # remove concepts without a disease; this also covers the rows where both the
  # disease and the mechanism are missing, so no separate filter is needed
  filter(!is.na(disease)) %>%
  arrange(vacc_id, num)
# df2 <- df2 %>%
# filter(disease %in% c("covid-19", "adenovirus"))
# df2 <- df2 %>%
# mutate(mechanism = NA_character_)
# single mechanism concepts: each distinct disease/mechanism pair that has a
# mechanism recorded
mechanism_single <- df2 %>%
  select(disease, mechanism) %>%
  filter(!is.na(mechanism)) %>%
  distinct()
# single disease concepts: each distinct disease with the mechanism blanked out
disease_single <- df2 %>%
  filter(!is.na(disease)) %>%
  transmute(disease, mechanism = NA_character_) %>%
  distinct()
# combination mechanism concepts: for vaccines with more than one component
# that has a mechanism, join the diseases and the mechanisms with "|"
mechanism_combos <- df2 %>%
  filter(!is.na(mechanism)) %>%
  group_by(vacc_id) %>%
  # keep only vaccines contributing more than one row
  filter(n() > 1) %>%
  summarise(
    disease = str_c(disease, collapse = "|"),
    mechanism = str_c(mechanism, collapse = "|")
  ) %>%
  ungroup() %>%
  select(disease, mechanism) %>%
  distinct()
# combination disease concepts: for vaccines covering more than one distinct
# disease, join the diseases with "|" and leave the mechanism missing
disease_combos <- df2 %>%
  distinct(vacc_id, disease) %>%
  group_by(vacc_id) %>%
  # keep only vaccines with more than one distinct disease
  filter(n() > 1) %>%
  summarise(disease = str_c(disease, collapse = "|")) %>%
  ungroup() %>%
  transmute(disease, mechanism = NA_character_) %>%
  distinct()
# combine all expanded concepts into a single dataframe, de-duplicated, sorted
# and numbered with a leading integer id column
expanded <- list(
  select(df2, disease, mechanism),
  mechanism_single,
  mechanism_combos,
  disease_single,
  disease_combos
) %>%
  bind_rows() %>%
  distinct() %>%
  arrange(disease, mechanism) %>%
  rowid_to_column(var = "id")
# reformat the concepts into a formal context: one row per concept id, one
# column per D_<disease> / M_<mechanism> attribute, "X" where the concept has it
fc <- expanded %>%
  pivot_longer(cols = c(disease, mechanism)) %>%
  # a "|"-joined combination contributes one row per member
  mutate(value = str_split(value, "\\|")) %>%
  unnest(cols = "value") %>%
  # normalise the attribute text so it can serve as a column name
  mutate(
    value = str_trim(value),
    value = str_replace_all(value, "\\s|-", "_"),
    value = str_remove_all(value, "\\(|\\)|,"),
    x = "X"
  ) %>%
  distinct() %>%
  filter(!is.na(value)) %>%
  # prefix with the capitalised first letter of the source column (D_ or M_)
  mutate(value = paste0(toupper(str_sub(name, 1, 1)), "_", value)) %>%
  select(-name) %>%
  pivot_wider(names_from = "value", values_from = "x", values_fill = "")
write_csv(fc, "formal_context.csv")
def boiler(csv_filename, output_path):
    """Build the concept lattice of a formal context and export it as two CSV tables.

    Args:
        csv_filename: Formal-context CSV file, resolved against the current
            working directory.
        output_path: Filename prefix for the outputs, resolved against the
            current working directory. "concept.csv" and
            "concept_relationship.csv" are appended to it.

    Writes:
        <output_path>concept.csv: columns ``id`` (0-based position of the
            concept in the lattice iteration order) and ``concept_name``
            (the concept's intent joined with "; ").
        <output_path>concept_relationship.csv: columns ``id_1``,
            ``relationship`` (always "Is a") and ``id_2``, one row per
            (concept, upper neighbor) pair, using the same 0-based ids.
    """
    from concepts import Context
    import pandas as pd
    import os

    print("Using working directory: " + os.getcwd())
    # os.path.join gives the same result as cwd + "/" + path for relative
    # paths and leaves an absolute path intact instead of mangling it
    output_path = os.path.join(os.getcwd(), output_path)

    # create the context object from the csv file
    context = Context.fromfile(os.path.join(os.getcwd(), csv_filename), frmat='csv')

    # use the attributes (intent) to define the concept name
    def get_concept_name(concept):
        return "; ".join(concept.intent)

    concept_list = list(context.lattice)
    concept_names = [get_concept_name(concept) for concept in concept_list]
    print(len(concept_list))

    # Position of every concept, keyed by object identity, so that resolving a
    # neighbor is a dict lookup rather than a linear scan of concept_list.
    # NOTE(review): assumes upper_neighbors yields the same objects the lattice
    # iterates over -- confirm against the concepts library.
    position = {id(concept): idx for idx, concept in enumerate(concept_list)}

    # (concept index, upper-neighbor index) pairs
    maps_to_list = [
        (idx, position[id(parent)])
        for idx, concept in enumerate(concept_list)
        for parent in concept.upper_neighbors
    ]

    # create the concept table; ids are 0-based and written out unchanged
    concept_df = pd.DataFrame({"id": range(len(concept_names)),
                               "concept_name": concept_names})
    # create the 'Is a' relationship table with the same 0-based ids:
    # id_1 is a concept, id_2 one of its upper neighbors
    concept_relationship_df = pd.DataFrame({"id_1": [child for child, _ in maps_to_list],
                                            "relationship": "Is a",
                                            "id_2": [parent for _, parent in maps_to_list]})

    concept_df.to_csv(output_path + "concept.csv", index=False)
    concept_relationship_df.to_csv(output_path + "concept_relationship.csv", index=False)


# Run the boiler function using the formal context as input
boiler('formal_context.csv', 'new_vaccine_vocab_')
## Using working directory: /Users/adamblack/projects/FCA_boiler/cvx_hcpcs_icdproc_experiment
## 108
# Load the exported concept table; a concept whose name is missing is labelled
# "Vaccine"
concept <- read_csv("new_vaccine_vocab_concept.csv") %>%
  mutate(concept_name = replace(concept_name, is.na(concept_name), "Vaccine"))
## Rows: 108 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): concept_name
## dbl (1): id
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the exported relationships, keeping only rows whose two ids both occur
# in the concept table
cr <- read_csv("new_vaccine_vocab_concept_relationship.csv") %>%
  filter(id_1 %in% concept$id & id_2 %in% concept$id)
## Rows: 214 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): relationship
## dbl (2): id_1, id_2
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the concept table (auto-printed)
concept
## # A tibble: 108 × 2
## id concept_name
## <dbl> <chr>
## 1 0 D_adenovirus; M_adenovirus_live; D_anthrax; D_cholera; D_covid_19; M_c…
## 2 1 D_adenovirus; M_adenovirus_live
## 3 2 D_anthrax
## 4 3 D_cholera
## 5 4 D_covid_19; M_covid_19_mRNA
## 6 5 D_covid_19; M_covid_19_vector
## 7 6 D_dengue
## 8 7 D_diphtheria; M_diphtheria_antitoxin
## 9 8 D_diphtheria; M_diphtheria_toxoid; D_tetanus; M_tetanus_toxoid; D_pert…
## 10 9 D_diphtheria; M_diphtheria_toxoid; D_tetanus; M_tetanus_toxoid; D_pert…
## # … with 98 more rows
# Preview the relationships with concept names in place of the numeric ids
cr %>%
  left_join(
    select(concept, id, concept_name_1 = concept_name),
    by = c(id_1 = "id")
  ) %>%
  left_join(
    select(concept, id, concept_name_2 = concept_name),
    by = c(id_2 = "id")
  ) %>%
  select(concept_name_1, relationship, concept_name_2)
## # A tibble: 214 × 3
## concept_name_1 relationship concept_name_2
## <chr> <chr> <chr>
## 1 D_adenovirus; M_adenovirus_live… Is a D_adenovirus; M_adenovirus_live
## 2 D_adenovirus; M_adenovirus_live… Is a D_anthrax
## 3 D_adenovirus; M_adenovirus_live… Is a D_cholera
## 4 D_adenovirus; M_adenovirus_live… Is a D_covid_19; M_covid_19_mRNA
## 5 D_adenovirus; M_adenovirus_live… Is a D_covid_19; M_covid_19_vector
## 6 D_adenovirus; M_adenovirus_live… Is a D_dengue
## 7 D_adenovirus; M_adenovirus_live… Is a D_diphtheria; M_diphtheria_ant…
## 8 D_adenovirus; M_adenovirus_live… Is a D_diphtheria; M_diphtheria_tox…
## 9 D_adenovirus; M_adenovirus_live… Is a D_diphtheria; M_diphtheria_tox…
## 10 D_adenovirus; M_adenovirus_live… Is a D_diphtheria; D_tetanus; D_per…
## # … with 204 more rows
Visualize Boiler Output
#' Draw a concept graph with the "fr" layout, arrowed edges and repelled labels.
#'
#' @param graph Graph to draw. Its nodes are labelled with their `display_name`
#'   column.
#' @param name_pattern Optional regular expression. When supplied, only nodes
#'   whose `name` matches it are kept before plotting; `NULL` (the default)
#'   keeps every node.
#' @return The plot object, suitable for `ggsave()`.
plot_concept_graph <- function(graph, name_pattern = NULL) {
  if (!is.null(name_pattern)) {
    graph <- graph %>%
      activate(nodes) %>%
      filter(str_detect(name, name_pattern))
  }
  graph %>%
    ggraph("fr") +
    geom_edge_link(
      arrow = arrow(
        angle = 20,
        length = unit(0.15, "inches"),
        ends = "last",
        type = "open"
      )
    ) +
    geom_node_point() +
    coord_fixed() +
    ggraph::geom_node_text(aes(label = display_name), repel = TRUE, force = 100)
}

# Every node
plt <- plot_concept_graph(g)
# plt
ggsave("vaccines.png", plt, width = 30, height = 30)

# Only nodes whose name mentions tetanus
plt <- plot_concept_graph(g, "D_tetanus")
ggsave("tetanus.png", plt, width = 30, height = 30)

# Only nodes whose name mentions measles
plt <- plot_concept_graph(g, "D_measles")
ggsave("measles.png", plt, width = 10, height = 10)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:plotly':
##
## groups
## The following object is masked from 'package:tidygraph':
##
## groups
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(igraphdata)
# NOTE(review): `karate` is not referenced by the code below -- it looks like a
# leftover from an example; confirm before removing.
data(karate, package = "igraphdata")
# G <- upgrade_graph(karate)
G <- g
# L <- layout.circle(G)
# Node coordinates chosen automatically by igraph
L <- layout.auto(G)
# Hover labels for the nodes
nm <- g %>%
  activate(nodes) %>%
  pull(display_name)
# Create Vertices and Edges
vs <- V(G)
# stringsAsFactors = FALSE keeps the edge endpoints as the values returned by
# get.edgelist() instead of factor codes on R < 4.0
es <- as.data.frame(get.edgelist(G), stringsAsFactors = FALSE)
Nv <- length(vs)
# One row of `es` per edge
Ne <- nrow(es)
# Create Nodes
library(plotly)
# First layout column is used as x, second as y; both are labelled with the
# node names so a coordinate can be looked up by name
Xn <- L[, 1]
Yn <- L[, 2]
names(Xn) <- names(Yn) <- pull(activate(g, nodes), name)
# network <- plot_ly(x = ~Xn, y = ~Yn, mode = "markers", text = vs$label, hoverinfo = "text")
# Marker trace with the display names as hover text
network <- plot_ly(
  x = ~Xn,
  y = ~Yn,
  mode = "markers",
  text = nm,
  hoverinfo = "text"
)
# Creates Edges: one line shape per row of `es`, running from the coordinates
# of its V1 endpoint to the coordinates of its V2 endpoint.
# seq_len() produces no iterations when Ne is 0, whereas 1:Ne would iterate
# over c(1, 0) and fail on a graph without edges.
edge_shapes <- lapply(seq_len(Ne), function(i) {
  v0 <- es[i, ]$V1
  v1 <- es[i, ]$V2
  list(
    type = "line",
    line = list(color = "#030303", width = 0.3),
    x0 = Xn[v0],
    y0 = Yn[v0],
    x1 = Xn[v1],
    y1 = Yn[v1]
  )
})
# Create Network: both axes share one spec that hides title, grid, tick labels
# and zero line; the edge shapes are drawn on top of the node trace
axis <- list(
  title = "",
  showgrid = FALSE,
  showticklabels = FALSE,
  zeroline = FALSE
)
fig <- network %>%
  layout(
    title = "FCA Boiler Vaccine Graph",
    shapes = edge_shapes,
    xaxis = axis,
    yaxis = axis
  )
# plotly::add_annotations(fig, "",
# x=30, # arrows' head
# y=30, # arrows' head
# ax=40, # arrows' tail
# ay=40, # arrows' tail
# xref='x',
# yref='y',
# axref='x',
# ayref='y',
# text='', # if you want only the arrow
# showarrow=T,
# arrowhead=3,
# arrowsize=1,
# arrowwidth=1,
# arrowcolor='black')
# Display the plotly figure (auto-printed)
fig
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter